# Build the working audio-features table from the raw one.
# 1. For every (track_name, external_urls_spotify) pair, all artist names are
#    concatenated into `artist_all`, separated by the literal token ",|,".
# 2. `artist_key` is the first artist of that list, i.e. everything before the
#    first ",|," token. The pipe has to be escaped: unescaped, `|` is regex
#    alternation and the pattern keys on the first plain comma instead, which
#    mangles artist names that contain a comma themselves.
# 3. Only one row per (artist_key, external_urls_spotify) is kept.
df_audio_features <- df_audio_features_raw %>%
  group_by(track_name, external_urls_spotify) %>%
  mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
  ungroup() %>%
  mutate(artist_key = sub(",\\|,.*", "", artist_all)) %>%
  dplyr::select(artist_name, artist_all, artist_key, everything()) %>%
  distinct(artist_key, external_urls_spotify, .keep_all = TRUE) %>%
  as.data.frame()
Error in df_audio_features_raw %>% group_by(track_name, external_urls_spotify) %>% :
could not find function "%>%"
cant_markets
audio_features Y charts
#Armamos un join para tener una tabla de charts con las características de las canciones
# Right join of the selected audio-feature columns onto df_charts, so every
# row of df_charts is kept.
# There should be 22993 complete rows.
join_audio_charts <- right_join(
  x = select(
    df_audio_features,
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ),
  y = df_charts,
  by = c(
    "track_name" = "Track_Name",
    "artist_key" = "Artist",
    "external_urls_spotify" = "URL"
  )
)
# SOME CHART ENTRIES HAVE NO FEATURES. THIS MUST BE TAKEN INTO ACCOUNT IN THE ANALYSIS
library(mice)
# Missing-data pattern of the joined table (column names rotated for readability)
md.pattern(join_audio_charts, rotate.names = TRUE)
# Rows of `popularidad` whose `indicador` column is NA
popularidad[is.na(popularidad$indicador),]
#Agregación de todas las semanas en charts
## Histogram of every continuous audio_features variable, with the mean drawn
## in red and the median in blue
for (i in features_continuas) {
  valores <- df_audio_features[, i]
  hist(valores, main = paste("Histograma de", i, "(all data)"), xlab = i)
  abline(v = mean(valores, na.rm = TRUE), col = "red")
  abline(v = median(valores, na.rm = TRUE), col = "blue")
  legend(
    "topright",
    legend = c("Media", "Mediana"),
    col = c("red", "blue"),
    lty = 1
  )
}
# Split the features according to their distribution
features_continuas_media <- c("danceability", "tempo", "valence")
features_continuas_mediana <- c(
  "acousticness", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets"
)
## Histogram of every continuous variable of the charts table (plus Streams),
## with the mean drawn in red and the median in blue
for (i in c(features_continuas, "Streams")) {
  valores <- join_audio_charts[, i]
  hist(valores, main = paste("Histograma de", i, "(charts)"), xlab = i)
  abline(v = mean(valores, na.rm = TRUE), col = "red")
  abline(v = median(valores, na.rm = TRUE), col = "blue")
}
# Split the chart features according to their distribution
audio_charts_continuas_media <- c("duration_ms", "valence")
audio_charts_continuas_mediana <- c(
  "danceability", "acousticness", "tempo", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets", "Streams"
)
## Bar plots of the categorical audio_features variables, levels ordered from
## most to least frequent
for (i in features_categoricas) {
  frecuencias <- table(df_audio_features[, i])
  barplot(
    sort(frecuencias, decreasing = TRUE),
    las = 2,
    main = paste("Barplot de", i, "(all data)")
  )
  # pie(table(df_features_categoricos[,i]))
}
## Bar plots of the categorical join_audio_charts variables, levels ordered
## from most to least frequent
for (i in features_categoricas) {
  frecuencias <- table(join_audio_charts[, i])
  barplot(
    sort(frecuencias, decreasing = TRUE),
    las = 2,
    main = paste("Barplot de", i, "(charts)")
  )
  # pie(table(df_features_categoricos[,i]))
}
markets_concat
#Hago un join al revés
# Flag every audio_features track with whether it appears in the charts
# (isinchart = 1) or not (isinchart = 0), then count the enabled markets of
# each group.
# The chart keys are de-duplicated first: df_charts can hold the same
# (Track_Name, Artist, URL) on several rows (it is aggregated per track further
# down in this file), and joining against repeated keys would copy a track once
# per chart row and inflate the market counts.
df_chart_tojoin <- unique(df_charts[, c("Track_Name", "Artist", "URL")])
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[
  , c("track_name", "artist_key", "external_urls_spotify", "markets_concat")
]
join_barplot <- df_audio_features_tojoin %>%
  dplyr::select(
    "track_name", "artist_key", "external_urls_spotify", "markets_concat"
  ) %>%
  left_join(
    df_chart_tojoin %>%
      dplyr::select("Track_Name", "Artist", "URL", "isinchart"),
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )
# Tracks without a chart match come back as NA from the left join
join_barplot$isinchart[is.na(join_barplot$isinchart)] <- 0
join_barplot$isinchart <- factor(join_barplot$isinchart)
# markets_concat is a comma-separated list of market codes; split it and count
# how often each code occurs inside and outside the charts
tabla_isinchart <- table(unlist(strsplit(
  as.character(join_barplot[join_barplot$isinchart == 1, "markets_concat"]),
  ","
)))
tabla_notinchart <- table(unlist(strsplit(
  as.character(join_barplot[join_barplot$isinchart == 0, "markets_concat"]),
  ","
)))
# Caption made of the first three and last three market codes
all_countries <- names(tabla_isinchart)
xlabs <- sprintf(
  "%s ... %s (ISO-Codes de Paises)",
  paste(head(all_countries, 3), collapse = ","),
  paste(tail(all_countries, 3), collapse = ",")
)
# Avoid scientific notation on the count axis
options(scipen = 999)
# Two panels side by side: markets of charted vs. non-charted tracks
par(mfrow = c(1, 2), las = 1, mar = c(3, 3, 5, 3), oma = c(0, 1, 1, 1))
barplot(
  sort(tabla_isinchart, decreasing = TRUE),
  names.arg = "",
  main = "En Charts",
  col = rgb(0.2, 0.4, 0.6, 0.6),
  xlab = "Paises (ISO-Codes)"
)
# mtext(side = 1, text = xlabs, line = 1)
barplot(
  sort(tabla_notinchart, decreasing = TRUE),
  names.arg = "",
  main = "Fuera de Charts",
  col = rgb(0.2, 0.4, 0.6, 0.6),
  xlab = "Paises (ISO-Codes)"
)
mtext(side = 1, text = xlabs, line = 1, adj = 2)
mtext(
  "Frecuencia de mercados habilitados",
  side = 3, line = -1, outer = TRUE, cex = 1.3, font = 2
)
# mtext("Paises (ISO-Codes)", side = 3, line = -25, outer = TRUE)
# Correlations among the audio_features attributes (complete rows only)
x <- cor(
  df_audio_features[, c(features_continuas_media, features_continuas_mediana)],
  use = "complete.obs"
)
corrplot(
  x,
  method = "number",
  type = "upper",
  number.cex = 0.7,
  mar = c(0, 0, 1, 0),
  title = "Correlacion de atributos de audio_features"
)
# Correlations among the chart attributes (standardised columns, complete rows only)
x <- cor(
  scale(
    join_audio_charts[
      , c(audio_charts_continuas_media, audio_charts_continuas_mediana)
    ]
  ),
  use = "complete.obs"
)
corrplot(
  x,
  method = "number",
  type = "upper",
  number.cex = 0.7,
  mar = c(0, 0, 1, 0),
  title = "Correlacion de atributos de los Charts"
)
# Chi-squared test (note from the author: with a large n this test cannot be used)
tabla_key_album <- table(
  df_audio_features$key_name,
  df_audio_features$album_type
)
cat("Tabla de contigencia entre key y album type\n")
print(tabla_key_album)
print(chisq.test(tabla_key_album))
# Split the features according to their distribution
features_continuas_media <- c("danceability", "tempo", "valence")
features_continuas_mediana <- c(
  "acousticness", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets"
)
all_features <- c(features_continuas_media, features_continuas_mediana)
# One horizontal boxplot per feature on a 4 x 3 grid
par(mfrow = c(4, 3))
for (feature in all_features) {
  boxplot(
    df_audio_features[, feature],
    main = feature,
    horizontal = TRUE,
    las = 2
  )
}
Con excepción de valence, el resto de las features poseía cierto sesgo. Se decidió transformar las variables que mayor sesgo poseían (duration_ms, instrumentalness, liveness y speechiness) como método para corregir la distribución y reducir la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que, al ser negativa, no admite directamente transformaciones como el logaritmo o la raíz cuadrada.
# "danceability,tempo,valence,acousticness,duration_ms,energy,instrumentalness,liveness,speechiness,cant_markets"
# Skewness of each continuous variable, computed as 3 * (mean - median) / sd
# and listed in increasing order
sort(apply(
  df_audio_features[, features_continuas],
  MARGIN = 2,
  function(v) {
    3 * (mean(v, na.rm = TRUE) - median(v, na.rm = TRUE)) / sd(v, na.rm = TRUE)
  }
))
variables_sesgo <- c(
  "acousticness", "duration_ms", "instrumentalness", "liveness",
  "speechiness", "cant_markets", "energy"
)
df_sesgadas <- df_audio_features[, variables_sesgo]
# Base-10 logarithm that tolerates exact zeros.
#
# x     : numeric value or vector.
# delta : small positive constant used in place of every element equal to 0.
#
# Returns log10(delta) where x is 0 and log10(x) elsewhere, with the same
# length as x. NA stays NA instead of aborting the `if`, and the function
# works on whole vectors, not only on scalars.
logaritmo_ajustado <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- delta
  log(x, base = 10)
}
# Zero replacement used by the adjusted logarithm
delta <- 10^(-6)
# Apply the adjusted logarithm cell by cell to the skewed variables
df_sesgadas_log_adjust <- data.frame(
  apply(
    df_audio_features[, variables_sesgo],
    MARGIN = c(1, 2),
    function(celda) logaritmo_ajustado(celda, delta)
  )
)
# names(df_sesgadas_log_adjust) <- paste(names(df_sesgadas), "_log", sep="")
names(df_sesgadas_log_adjust) <- names(df_sesgadas)
df_datos <- cbind(df_sesgadas, df_sesgadas_log_adjust)
# Original and "_log" columns side by side, ordered by column name
a <- df_sesgadas
b <- df_sesgadas_log_adjust
names(b) <- paste0(names(df_sesgadas), "_log")
merged <- cbind(a, b)
merged <- merged[, order(names(merged))]
# Skewness (3 * (mean - median) / sd) before and after the transformation
round(
  sort(apply(
    merged,
    MARGIN = 2,
    function(v) {
      3 * (mean(v, na.rm = TRUE) - median(v, na.rm = TRUE)) / sd(v, na.rm = TRUE)
    }
  )),
  2
)
# Compare duration_ms with its "_log" counterpart
variables_plot <- "duration_ms"
variables_plot <- sort(c(variables_plot, paste0(variables_plot, "_log")))
plotear <- merged[, variables_plot]
par(mfrow = c(1, 2))
for (col in names(plotear)) {
  hist(plotear[, col], main = col, xlab = "", breaks = "FD")
}
summary(df_audio_features[, all_features])
hist(log(df_audio_features$duration_ms))
# Variables for which several transformations are tried below
transformacion <- c(
  "instrumentalness", "loudness", "liveness", "speechiness", "duration_ms"
)
# Base-10 logarithm that tolerates zero and negative inputs.
#
# x     : numeric value or vector.
# delta : small positive constant used in place of every element <= 0.
#
# Returns log10(delta) where x <= 0 and log10(x) elsewhere, with the same
# length as x. NA stays NA instead of aborting the `if`, and the function
# works on whole vectors, not only on scalars.
logaritmo_ajustado <- function(x, delta) {
  no_positivo <- !is.na(x) & x <= 0
  x[no_positivo] <- delta
  log(x, base = 10)
}
# Zero replacement used by the adjusted logarithm
delta <- 10^(-6)
# One column per variable: original histograms on the top row and the
# log-adjusted ones right below. The grid is sized from `transformacion`; a
# fixed 2 x 4 grid has only 8 panels for the 10 plots drawn here.
par(mfrow = c(2, length(transformacion)))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
# Inverse square root that tolerates exact zeros.
#
# x     : numeric value or vector.
# delta : small positive constant added to every element equal to 0.
#
# Returns 1 / sqrt(x + delta) where x is 0 and 1 / sqrt(x) elsewhere, with the
# same length as x. NA stays NA instead of aborting the `if`, and the function
# works on whole vectors. Negative inputs still yield NaN (with a warning),
# exactly as sqrt() does.
inv_sqrt_ajustada <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- x[es_cero] + delta
  1 / sqrt(x)
}
# Zero replacement used by the adjusted inverse square root
delta <- 10^(-6)
# One column per variable: original histograms on the top row and the
# inverse-square-root ones right below. The grid is sized from
# `transformacion`; a fixed 2 x 4 grid has only 8 panels for the 10 plots
# drawn here.
par(mfrow = c(2, length(transformacion)))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}
# One column per variable: original histograms on the top row and the
# square-root ones right below. The grid is sized from `transformacion`; a
# fixed 2 x 4 grid has only 8 panels for the 10 plots drawn here.
par(mfrow = c(2, length(transformacion)))
for (feature in transformacion){
  hist(df_audio_features[,feature], main=feature)
}
for (feature in transformacion){
  hist(sqrt(df_audio_features[,feature]), main=paste(feature,"sqrt", sep="_"))
}
# Histogram and boxplot of loudness_reg_imp, stacked vertically.
# Note: in this file the loudness_reg_imp column is assigned further below,
# so that assignment has to run before these plots.
par(mfrow = c(2, 1))
hist(df_audio_features[, "loudness_reg_imp"], xlab = "", main = "loudness")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[, "loudness_reg_imp"], horizontal = TRUE)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)
# Regression-based replacement of positive loudness values.
# A linear model loudness ~ energy + acousticness is fitted on the whole table;
# loudness_reg_imp is a copy of loudness in which every row with loudness > 0
# receives the value predicted by that model.
fit <- lm(loudness ~ energy + acousticness, data = df_audio_features)
modelo <- fit$coefficients
df_audio_features$loudness_reg_imp <- df_audio_features$loudness
# which() discards NA comparisons: a logical index containing NA would break
# the subscripted assignment below when loudness has missing values.
filas_positivas <- which(df_audio_features$loudness > 0)
X <- df_audio_features[filas_positivas, c("energy", "acousticness")]
df_audio_features$loudness_reg_imp[filas_positivas] <-
  modelo[1] + modelo[2] * X[, 1] + modelo[3] * X[, 2]
summary(df_audio_features[, c("loudness", "loudness_reg_imp")])
summary(fit)
La variable instrumentalness tiene mucho sesgo. Se va a recurrir a una transformación logarítmica de la variable, previa transformación del dominio, haciendo que los valores que son 0 sean en realidad 0.000001 (delta = 10^-6).
# Base-10 logarithm that tolerates exact zeros.
#
# x     : numeric value or vector.
# delta : small positive constant added to every element equal to 0.
#
# Returns log10(x + delta) where x is 0 and log10(x) elsewhere, with the same
# length as x. NA stays NA instead of aborting the `if`, and the function
# works on whole vectors, not only on scalars.
logaritmo_ajustado <- function(x, delta) {
  es_cero <- !is.na(x) & x == 0
  x[es_cero] <- x[es_cero] + delta
  log(x, base = 10)
}
# Zero replacement used by the adjusted logarithm
delta <- 10^(-6)
# Log-adjusted instrumentalness, computed once and reused by the plots below
df_audio_features$instrumentalness_logadjust <- vapply(
  df_audio_features$instrumentalness,
  function(x) logaritmo_ajustado(x, delta),
  numeric(1)
)
# 2 x 2 panel: histograms on top (raw vs. log-adjusted), boxplots underneath
par(mfrow = c(2, 2))
hist(df_audio_features$instrumentalness, main = "instrumentalness", xlab = "")
hist(
  df_audio_features$instrumentalness_logadjust,
  main = "instrumentalness_logadjust",
  ylim = c(0, 130500),
  xlab = ""
)
boxplot(df_audio_features$instrumentalness, main = "", horizontal = TRUE)
boxplot(
  df_audio_features$instrumentalness_logadjust,
  main = "",
  horizontal = TRUE
)
# hist(log(1/sqrt(df_audio_features$instrumentalness+0.00001)),main='log(sqrt(x+))', ylim=c(0,130500), xlab = "")
¿Es útil esta transformación?
# Flag every audio_features track with whether it appears in the charts
# (isinchart = 1) or not (isinchart = 0), keeping instrumentalness and its
# log-adjusted version for the comparison plots.
delta <- 10^(-6)
df_audio_features$instrumentalness_logadjust <- unlist(lapply(
  df_audio_features$instrumentalness,
  function(x) logaritmo_ajustado(x, delta)
))
# The chart keys are de-duplicated first: df_charts can hold the same
# (Track_Name, Artist, URL) on several rows (it is aggregated per track further
# down in this file), and joining against repeated keys would copy a track once
# per chart row and distort the in-chart distribution.
df_chart_tojoin <- unique(df_charts[, c("Track_Name", "Artist", "URL")])
df_chart_tojoin$isinchart <- 1
df_audio_features_tojoin <- df_audio_features[
  , c(
    "track_name", "artist_key", "external_urls_spotify",
    "instrumentalness", "instrumentalness_logadjust"
  )
]
join_histogram <- df_audio_features_tojoin %>%
  dplyr::select(
    "track_name", "artist_key", "external_urls_spotify",
    "instrumentalness", "instrumentalness_logadjust"
  ) %>%
  left_join(
    df_chart_tojoin %>%
      dplyr::select("Track_Name", "Artist", "URL", "isinchart"),
    by = c(
      "track_name" = "Track_Name",
      "artist_key" = "Artist",
      "external_urls_spotify" = "URL"
    )
  )
# Tracks without a chart match come back as NA from the left join
join_histogram$isinchart[is.na(join_histogram$isinchart)] <- 0
join_histogram$isinchart <- factor(join_histogram$isinchart)
# Histogram objects for instrumentalness (h1x) and its log-adjusted version
# (h2x), split into charted tracks (hx1) and non-charted tracks (hx2).
# hist() also draws each histogram as a side effect; the returned object is
# kept so that its `density` slot can be overwritten with the percentage of
# observations per bin (counts / total * 100).
h11 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness'])
h11$density <- h11$counts/sum(h11$counts)*100
h12 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness'])
h12$density <- h12$counts/sum(h12$counts)*100
h21 <- hist(join_histogram[join_histogram$isinchart==1,'instrumentalness_logadjust'])
h21$density <- h21$counts/sum(h21$counts)*100
h22 <- hist(join_histogram[join_histogram$isinchart==0,'instrumentalness_logadjust'])
h22$density <- h22$counts/sum(h22$counts)*100
#png("C:/Users/Asus/Desktop/DATA SCIENCE/MAESTRIA/Data Mining/TP/graficos/instrumentalness.png",
# width = 800, height = 800)
# 3 x 2 panel: charted tracks on the left, non-charted tracks on the right.
# Rows: raw instrumentalness, log-adjusted instrumentalness, boxplots of the
# log-adjusted variable. freq = FALSE plots the `density` slot, which holds
# percentages, hence the 0-100 y axis.
# The y-axis label spelling is corrected ("Porcentaje").
par(mfrow = c(3, 2))
plot(
  h11, main = "instrumentalness \nchart", xlab = "", ylab = "Porcentaje",
  freq = FALSE, col = "grey", ylim = c(0, 100)
)
plot(
  h12, main = "instrumentalness \nfuera chart", xlab = "", ylab = "Porcentaje",
  freq = FALSE, col = "grey", ylim = c(0, 100)
)
plot(
  h21, main = "instrumentalness_log \nchart", xlab = "", ylab = "Porcentaje",
  freq = FALSE, col = "grey", ylim = c(0, 100)
)
plot(
  h22, main = "instrumentalness_log \nfuera chart", xlab = "",
  ylab = "Porcentaje", freq = FALSE, col = "grey", ylim = c(0, 100)
)
boxplot(
  join_histogram[join_histogram$isinchart == 1, "instrumentalness_logadjust"],
  main = "instrumentalness_log chart",
  horizontal = TRUE
)
boxplot(
  join_histogram[join_histogram$isinchart == 0, "instrumentalness_logadjust"],
  main = "instrumentalness_log fuera chart",
  horizontal = TRUE
)
#dev.off()
################################
## OUTLIER FILTERING BY Z-SCORE for 'danceability', 'tempo', 'valence'
##############################
# z-score for the variables that tend towards a normal distribution
# Split the features according to their distribution
features_continuas_media <- c("danceability", "tempo", "valence")
df_audio_features_zscore_media <- df_audio_features[, features_continuas_media]
# Add one "zscore_<variable>" column per variable: (value - mean) / sd.
# Missing values are skipped when computing mean and sd (as the other summary
# statistics in this file do); a single NA would otherwise turn the whole
# z-score column into NA.
zscore_cols <- paste0("zscore_", features_continuas_media)
for (col in features_continuas_media) {
  name_col <- paste0("zscore_", col)
  valores <- df_audio_features_zscore_media[, col]
  media <- mean(valores, na.rm = TRUE)
  stdv <- sd(valores, na.rm = TRUE)
  df_audio_features_zscore_media[, name_col] <- (valores - media) / stdv
}
# One boxplot per z-score column, side by side; invisible() keeps the list of
# boxplot statistics returned by lapply() out of the console
par(mfrow = c(1, length(zscore_cols)))
invisible(lapply(
  zscore_cols,
  function(col) boxplot(df_audio_features_zscore_media[, col], xlab = col)
))
# Rows whose z-score lies more than `umbral_zscore` standard deviations away
# from the mean, listed from the highest to the lowest value of the variable.

# Variable: danceability
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_danceability) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, danceability) %>%
  arrange(desc(danceability))

# Variable: tempo
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_tempo) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, tempo) %>%
  arrange(desc(tempo))

# Variable: valence
umbral_zscore <- 3
conditions <- abs(df_audio_features_zscore_media$zscore_valence) > umbral_zscore
df_audio_features[conditions, ] %>%
  select(album_name, artist_name, valence) %>%
  arrange(desc(valence))
################################
## OUTLIER FILTERING BY MODIFIED Z-SCORE for 'acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'
##############################
features_continuas_mediana <- c(
  "acousticness", "duration_ms", "energy", "instrumentalness",
  "liveness", "loudness", "speechiness", "cant_markets"
)
df_audio_features_zscore_mediana <- df_audio_features[, features_continuas_mediana]
# Add one "zscoremodif_<variable>" column per variable:
# 0.6745 * (value - median) / MAD, where MAD is the median absolute deviation
# from the median.
# NOTE(review): a MAD of 0 would make this division return Inf/NaN - confirm
# that none of the variables hits that case.
zscoremodif_cols <- c()
for (col in names(df_audio_features_zscore_mediana)) {
  name_col <- paste0("zscoremodif_", col)
  zscoremodif_cols <- c(zscoremodif_cols, name_col)
  valores <- df_audio_features_zscore_mediana[, col]
  med <- median(valores, na.rm = TRUE)
  MAD <- median(abs(valores - med), na.rm = TRUE)
  df_audio_features_zscore_mediana[, name_col] <- 0.6745 * (valores - med) / MAD
}
# One horizontal boxplot per modified z-score column on a 4 x 2 grid
par(mfrow = c(4, 2))
lapply(
  zscoremodif_cols,
  function(col) {
    boxplot(
      df_audio_features_zscore_mediana[, col],
      xlab = col,
      horizontal = TRUE
    )
  }
)
# Instrumentalness
# (the section heading had been fused into the identifier of the assignment
# below, producing a variable called `Instrumentalnessinstrumentalness`)
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")
x <- df_audio_features$instrumentalness
n_interv <- 10
# Equal-width break points over the observed range of x. The same break points
# feed both cut() and the labels, so every label describes exactly the bin it
# is attached to (also when min(x) is not 0 or x contains NA).
limites <- seq(
  min(x, na.rm = TRUE),
  max(x, na.rm = TRUE),
  length.out = n_interv + 1
)
intervalos <- round(limites, 2)
# Label of each bin: lower and upper bound on two lines
labs <- paste(
  intervalos[-length(intervalos)],
  intervalos[-1],
  sep = "\n"
)
bins <- cut(x, breaks = limites, include.lowest = TRUE, labels = labs)
barplot(table(bins))
Hacemos K-means para poder discretizar la variable.
# Total within-cluster sum of squares of k-means on instrumentalness for
# k = 2..6, plotted against k
sse <- numeric(length(2:6))
for (k in 2:6) {
  clusters <- kmeans(
    df_audio_features$instrumentalness,
    centers = k,
    iter.max = 10,
    nstart = k
  )
  sse[k - 1] <- clusters$tot.withinss
}
plot(
  2:6, sse,
  type = "l",
  xlab = "Cantidad de Clusters",
  ylab = "Suma Error Cuadrático"
)
# k = 3
clusters3 <- kmeans(
  df_audio_features$instrumentalness,
  centers = 3,
  iter.max = 10,
  nstart = 3
)
df_audio_features$clusters <- factor(clusters3$cluster)
lev <- levels(df_audio_features$clusters)
# One "min - max" label per cluster with the range of instrumentalness it
# covers. range() is used so that no global variables named `min` / `max`
# (which mask the base functions of the same name) are created.
labs <- vapply(
  lev,
  function(nivel) {
    rango <- range(
      df_audio_features$instrumentalness[df_audio_features$clusters == nivel]
    )
    paste(rango[1], rango[2], sep = " - ")
  },
  character(1),
  USE.NAMES = FALSE
)
labs
# barplot(table(factor(clusters3$cluster)), labels = labs)
¿Qué características tienen las canciones que están en el chart? ¿Cuál es el patrón común que tienen las canciones más escuchadas? (ver dispersiones, media, gráfico comparativo)
# Standardise a variable: subtract its mean and divide by its standard
# deviation, both computed ignoring missing values.
scale_vble <- function(x) {
  media <- mean(x, na.rm = TRUE)
  desvio <- sd(x, na.rm = TRUE)
  (x - media) / desvio
}
# Anti join: audio_features rows with no match in df_charts
# (matched on Spotify URL and artist key)
anti_join_audio_charts <- df_audio_features %>%
  select(
    "artist_name", "artist_all", "artist_key",
    "track_name", "external_urls_spotify", "album_name", "album_release_year",
    all_of(features_continuas), all_of(features_categoricas)
  ) %>%
  anti_join(
    df_charts %>%
      select("Track_Name", "Artist", "URL"),
    by = c(
      "external_urls_spotify" = "URL",
      "artist_key" = "Artist"
    )
  )
# by = c("track_name" = "Track_Name"))
# Keep complete rows only
anti_join_audio_charts_complete <- na.omit(anti_join_audio_charts)
# Distinct rows, continuous features only, each one standardised.
# all_of() makes explicit that `features_continuas` is an external character
# vector of column names (same idiom as in the select() above); passing the
# bare vector to select() is deprecated.
anti_join_audio_charts_complete_scale <- anti_join_audio_charts_complete %>%
  distinct() %>%
  select(all_of(features_continuas)) %>%
  mutate_all(scale_vble)
nrow(anti_join_audio_charts_complete_scale)
# Number of rows of join_audio_charts per artist, largest count first
join_audio_charts %>%
  group_by(artist_name) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n))

# Number of rows of join_audio_charts per (track, artist, URL), largest count
# first, with the track name and the count as leading columns
join_audio_charts %>%
  group_by(track_name, artist_name, external_urls_spotify) %>%
  dplyr::summarise(n = n()) %>%
  arrange(desc(n)) %>%
  select(track_name, n, everything(.))
# Time each track spent in the chart: first week start, last week end, and
# the span between them in days and in calendar years
df_charts %>%
  mutate(
    week_start = as.Date(week_start),
    week_end = as.Date(week_end),
    week_year = (year(week_start))
  ) %>%
  arrange(Artist, Track_Name) %>%
  group_by(Artist, Track_Name, URL) %>%
  dplyr::summarise(
    day_in = min(week_start),
    year_in = year(day_in),
    day_max = max(week_end),
    year_max = year(day_max),
    duracion_chart_dias = day_max - day_in,
    duracion_chart_anio = year_max - year_in
  ) %>%
  arrange(Artist)
# Trial of a log10 transformation followed by a Shapiro-Wilk normality test,
# printing one p-value per continuous feature.
# Two guards avoid the failures this loop used to hit:
#  * log10() is only defined for positive values, so missing and non-positive
#    values are dropped before transforming (otherwise NaNs are produced);
#  * shapiro.test() only accepts between 3 and 5000 observations, so larger
#    samples are randomly subsampled (call set.seed() beforehand for
#    reproducible p-values) and samples that are too small, or constant, are
#    skipped with NA printed instead.
max_n_shapiro <- 5000L
for (i in features_continuas) {
  valores <- df_chart_w_lyrics[, i]
  valores <- valores[!is.na(valores) & valores > 0]
  x <- log10(valores)
  if (length(x) > max_n_shapiro) {
    x <- sample(x, max_n_shapiro)
  }
  if (length(x) < 3L || diff(range(x)) == 0) {
    message("shapiro.test skipped for '", i, "': not enough distinct positive values")
    print(NA_real_)
    next
  }
  z <- shapiro.test(x)$p.value
  print(z)
}
[1] 1.85241e-21
[1] 1.167246e-23
[1] 7.44622e-11
[1] 1.256851e-30
[1] NaN
[1] 3.287572e-17
NaNs producedError in shapiro.test(x) : sample size must be between 3 and 5000